# Required packages
import numpy as np
import pandas as pd
import pycountry
import pycountry_convert
import re
# Visualisation libraries
import matplotlib.pyplot as plt
import seaborn as sns
from plotly.offline import init_notebook_mode, iplot
import plotly.graph_objs as go
import plotly.offline as py
from wordcloud import WordCloud
from plotly.subplots import make_subplots
import plotly.express as px
# Graphics in retina format
%config InlineBackend.figure_format = 'retina'
# Seaborn defaults: "paper" context with enlarged fonts on a white grid.
sns.set_context("paper", rc={"font.size": 12, "axes.titlesize": 14, "axes.labelsize": 12})
sns.set_style("whitegrid")
# Matplotlib defaults.
# matplotlib 3.6 renamed its bundled seaborn style sheets to "seaborn-v0_8-*" and
# later releases dropped the old names, so use whichever name this installation has.
if 'seaborn-whitegrid' in plt.style.available:
    plt.style.use('seaborn-whitegrid')
else:
    plt.style.use('seaborn-v0_8-whitegrid')
plt.rcParams['axes.labelsize'] = 14
plt.rcParams['xtick.labelsize'] = 12
plt.rcParams['ytick.labelsize'] = 12
plt.rcParams['text.color'] = 'k'
%matplotlib inline
import warnings
# Suppress every Python warning from here on; this includes deprecation
# warnings raised by the libraries imported above.
warnings.filterwarnings("ignore")
In this article, we visualize the data from the Kaggle surveys of three consecutive years (2017, 2018, and 2019). The results include raw numbers about who is working with data, what’s happening with machine learning in different industries, and the best ways for new data scientists to break into the field. Kaggle published the data in as raw a format as possible without compromising anonymization, which makes it an unusual example of a survey dataset.
# Load the multiple-choice responses of the three surveys.
# header=1 takes the column names from the second line of the 2018/2019 files
# (presumably the full question text, given the names Rename_func looks for -- confirm against the CSVs).
Data19 = pd.read_csv('kaggle-survey-2019/multiple_choice_responses.csv', header=1)
Data18 = pd.read_csv('kaggle-survey-2018/multipleChoiceResponses.csv', header=1)
# The 2017 file keeps its first line as header and is decoded as ISO-8859-1.
Data17 = pd.read_csv('kaggle-survey-2017/multipleChoiceResponses.csv',encoding='ISO-8859-1')
def Search(Mylist, key):
    """Return the items of ``Mylist`` that contain ``key``, in their original order."""
    matches = []
    for item in Mylist:
        if key in item:
            matches.append(item)
    return matches
def Search_df(df, key):
    """Return the column labels of ``df`` that contain ``key``, in column order."""
    return list(filter(lambda label: key in label, df.columns.tolist()))
def Rename_func(df):
    """Return a copy of ``df`` with the survey columns renamed to short labels.

    The mapping lists both the long question texts and the short 2017-style
    names, so the same call can be applied to every survey frame. Columns that
    are not listed are left as they are.
    """
    short_names = {
        'In which country do you currently reside?': 'Country',
        'What is your gender? - Selected Choice': 'Gender',
        'GenderSelect': 'Gender',
        'What is your age (# years)?': 'Age Group',
        'What is the highest level of formal education that you have attained or plan to attain within the next 2 years?': 'FormalEducation',
        'Select the title most similar to your current role (or most recent title if retired): - Selected Choice': 'CurrentJobTitle',
        'CurrentJobTitleSelect': 'CurrentJobTitle',
        'What is your current yearly compensation (approximate $USD)?': 'CurrentSalary',
        'What is the size of the company where you are employed?': 'CompanySize',
        'Approximately how many individuals are responsible for data science workloads at your place of business?': 'DataScienceTeamSize',
    }
    renamed = df.rename(columns=short_names)
    return renamed
# Give the three survey frames the same short column names.
Data19 = Rename_func(Data19)
Data18 = Rename_func(Data18)
Data17 = Rename_func(Data17)
# The helper is not used after this point; remove it from the namespace.
del Rename_func
# Free-text ("- Text") columns to remove from the 2018 and 2019 frames.
Cols = {'What is your gender? - Prefer to self-describe - Text',
'Select the title most similar to your current role (or most recent title if retired): - Other - Text',
'Select any activities that make up an important part of your role at work: (Select all that apply) - Other - Text'}
# NOTE: with the default errors='raise', drop() fails with KeyError if a label is missing from a frame.
Data18.drop(columns = Cols, inplace = True)
Data19.drop(columns = Cols, inplace = True)
# Labels of the 2019 columns whose name contains 'Select'.
Columns = Data19.columns.tolist()
Select_Cols = Search(Columns, 'Select')
def Countries_func(Col):
    """Return ``Col`` with long or variant country names replaced by short ones.

    Values that are not listed in the mapping are returned unchanged.
    """
    short_names = {
        'United States of America': 'United States',
        'Viet Nam': 'Vietnam',
        "People 's Republic of China": 'China',
        'Republic of China': 'China',
        "United Kingdom of Great Britain and Northern Ireland": 'United Kingdom',
        "Hong Kong (S.A.R.)": 'Hong Kong',
        'Republic of Korea': 'South Korea',
        'Iran, Islamic Republic of...': 'Iran',
        'I do not wish to disclose my location': 'Other',
    }
    return Col.replace(short_names)
# Normalise the country names of every survey year so they can be compared.
Data19['Country'] = Countries_func(Data19['Country'])
Data18['Country'] = Countries_func(Data18['Country'])
Data17['Country'] = Countries_func(Data17['Country'])
# The helper is not used after this point; remove it from the namespace.
del Countries_func
# Answers folded into the single 'Other' gender category; np.nan covers missing answers.
_other_answers = ['Prefer to self-describe', 'Prefer not to say',
                  'Non-binary, genderqueer, or gender non-conforming',
                  'A different identity', np.nan]
for _frame in (Data17, Data18, Data19):
    _is_other = _frame['Gender'].isin(_other_answers)
    _frame.loc[_is_other, 'Gender'] = 'Other'
del _other_answers, _frame, _is_other
def Coutry_Continent(x):
    """Return the continent name for country name ``x``, or NaN if it cannot be resolved.

    The name is converted to an ISO alpha-2 code, then to a continent code and
    finally to the continent name, using ``pycountry_convert``.
    """
    try:
        alpha2 = pycountry_convert.country_name_to_country_alpha2(x, cn_name_format="default")
        continent_code = pycountry_convert.country_alpha2_to_continent_code(alpha2)
        return pycountry_convert.convert_continent_code_to_continent_name(continent_code)
    except Exception:
        # Best effort: unknown names (e.g. 'Other') and missing values map to NaN.
        # Unlike a bare ``except:``, this lets KeyboardInterrupt/SystemExit propagate.
        return np.nan
# Derive a 'Continent' column from 'Country' in every survey frame.
for _frame in (Data17, Data18, Data19):
    _frame['Continent'] = _frame['Country'].apply(Coutry_Continent)
del Coutry_Continent, _frame
def Age_Group(x):
    """Return the survey age bracket label for a numeric age ``x``.

    Brackets are '18-21', '22-24', '25-29', ..., '55-59', '60-69' and '70+'.
    Ages below 18 and NaN return NaN. Each bracket extends up to the next
    bracket's lower bound, so fractional ages such as 21.5 are classified too
    instead of falling between two integer ranges.
    """
    # (lower bound, label), ascending by lower bound.
    brackets = (
        (18, '18-21'), (22, '22-24'), (25, '25-29'), (30, '30-34'),
        (35, '35-39'), (40, '40-44'), (45, '45-49'), (50, '50-54'),
        (55, '55-59'), (60, '60-69'), (70, '70+'),
    )
    label = np.nan
    for lower, name in brackets:
        # ``not x >= lower`` is also true for NaN, which therefore stays unlabelled.
        if not x >= lower:
            break
        label = name
    return label
# 2017 stores a numeric 'Age'; turn it into the bracket labels used by the other frames.
Data17['Age Group'] = Data17['Age'].apply(Age_Group)
# Fold the two oldest 2018 brackets into a single '70+' bracket.
Data18['Age Group'] = Data18['Age Group'].replace(dict.fromkeys(['70-79', '80+'], '70+'))
del Age_Group
def Education_func(Col):
    """Return ``Col`` with the education labels rewritten to one common wording.

    The straight-apostrophe spellings are replaced by the curly-apostrophe
    ones; values not listed in the mapping are returned unchanged.
    """
    unified = {
        'I did not complete any formal education past high school': 'No formal education past high school',
        "Bachelor's degree": 'Bachelor’s degree',
        "Master's degree": 'Master’s degree',
        "Some college/university study without earning a bachelor's degree":
            'Some college/university study without earning a bachelor’s degree',
    }
    return Col.replace(unified)
# Use one wording for the education labels in every survey year.
for _frame in (Data17, Data18, Data19):
    _frame['FormalEducation'] = Education_func(_frame['FormalEducation'])
# Treat "do not wish to disclose" salaries as missing values.
# The result is assigned back to the column: an in-place replace on ``frame.CurrentSalary``
# acts on a temporary Series and leaves the frame untouched under pandas Copy-on-Write.
_undisclosed = 'I do not wish to disclose my approximate yearly compensation'
for _frame in (Data18, Data19):
    _frame['CurrentSalary'] = _frame['CurrentSalary'].replace({_undisclosed: np.nan})
del _frame, _undisclosed
# Number of responses per survey year; the counts are listed in the same order as the years.
Temp = pd.DataFrame({'Year': [2017, 2018, 2019],
                     'Responses': [Data17.shape[0], Data18.shape[0], Data19.shape[0]]})
fig = px.bar(Temp, y='Year', x='Responses', orientation='h', text='Responses', height=250)
fig.update_traces(marker_color='lightYellow', marker_line_color='darkRed',
                  marker_line_width=1.5, opacity=1)
fig.update_traces(texttemplate='%{text:.2}', textposition='inside')
fig.update_layout(uniformtext_minsize=8, uniformtext_mode='hide')
# Single x-axis range (an earlier [0, 10e3] setting was immediately overwritten by this one).
fig['layout']['xaxis'].update(range=[0, 25e3])
fig.update_layout(title='Number of Responses by Year', plot_bgcolor='white')
fig.show()
A quick comparison of the number of responses by year shows that 2018 had the most responses.
Top = 10
Colors = ['lavender', 'steelblue', 'royalblue']
# One column of per-country response counts per survey year.
# The columns are aligned on the union of all countries: joining onto the 2019
# list only would drop countries absent from 2019 and shrink the 2017/2018
# totals used as denominators for the percentages below.
Temp = pd.concat(
    [Data19.groupby('Country')['Country'].count().rename('2019'),
     Data18.groupby('Country')['Country'].count().rename('2018'),
     Data17.groupby('Country')['Country'].count().rename('2017')],
    axis=1).rename_axis('Country')
Temp = Temp.fillna(0).astype(int).reset_index(drop=False)
Temp = Temp.sort_values(by=['2019', '2018', '2017'], ascending=False)
# Share of each country in the year's responses (computed before 'Other' is removed).
for _year in ('2017', '2018', '2019'):
    Temp[_year + 'Percentage'] = np.round(100 * Temp[_year] / Temp[_year].sum(), 2)
Temp = Temp.loc[Temp.Country != 'Other']
Temp = Temp[:Top]
# Reused by later cells to restrict other charts to the same countries.
TopCoutries = Temp.Country.tolist()

fig = go.Figure()
for _year, _color in zip(('2019', '2018', '2017'), Colors):
    fig.add_trace(go.Bar(x=Temp.Country, y=Temp[_year], name='Responses in ' + _year,
                         marker_color=_color, text=Temp[_year + 'Percentage'],
                         textposition='inside'))
fig.update_traces(marker_line_color='black', marker_line_width=1, opacity=1)
fig['layout']['yaxis'].update(range=[0, 5e3])
fig.update_layout(title='Number of Responses by Country (Top %i)' % Top, plot_bgcolor='white')
fig.update_yaxes(showgrid=True, gridwidth=1, gridcolor='Lightgray')
fig.update_xaxes(showline=True, linewidth=1, linecolor='Lightgray', mirror=True)
fig.update_yaxes(showline=True, linewidth=1, linecolor='Lightgray', mirror=True)
fig.show()
It can be seen that, in each year, the largest numbers of responses come from India and the United States.
# Long table with one row per (country, year) holding the number of responses.
_yearly = []
for _year, _frame in ((2019, Data19), (2018, Data18), (2017, Data17)):
    _counts = _frame.groupby(['Country'])['Country'].count().to_frame('Count').reset_index(drop=False)
    _counts['Year'] = _year
    _yearly.append(_counts)
Temp = pd.concat(_yearly)
# 'Other' is not a country and has no ISO code.
Temp = Temp.loc[Temp.Country != 'Other']
# ISO alpha-3 codes are what the choropleth uses to locate each country.
Temp['alpha3'] = [pycountry_convert.country_name_to_country_alpha3(_name, cn_name_format="default")
                  for _name in Temp.Country]
fig = px.choropleth(Temp, locations='alpha3', color="Count", hover_name="Country",
                    animation_frame="Year", range_color=[0, 5e3],
                    color_continuous_scale="Greens")
fig.show()
Colors = ['RoyalBlue', 'salmon', 'ForestGreen']
# One donut chart per survey year, side by side.
fig = make_subplots(rows=1, cols=3,
                    specs=[[{'type': 'domain'}, {'type': 'domain'}, {'type': 'domain'}]])
for _col, (_year, _frame) in enumerate((('2017', Data17), ('2018', Data18), ('2019', Data19)), start=1):
    Temp = _frame.groupby(['Gender'])['Gender'].count().to_frame('Count').reset_index(drop=False)
    _pie_args = dict(labels=Temp.Gender.values, values=Temp.Count.values,
                     name=_year, textfont=dict(size=16))
    if _col == 1:
        # Only the first pie is given the explicit colour list.
        _pie_args['marker'] = dict(colors=Colors, line=dict(color='black', width=1))
    fig.add_trace(go.Pie(**_pie_args), 1, _col)
fig.update_traces(hole=.6, marker_line_color='black', marker_line_width=1, opacity=1)
# The year labels are placed in the hole of each donut.
_year_labels = [dict(text=_text, x=_x, y=0.5, font_size=20, showarrow=False)
                for _text, _x in (('2017', 0.11), ('2018', 0.5), ('2019', 0.88))]
fig.update_layout(title="Gender Distribution", font=dict(size=14),
                  legend=dict(orientation="v"), annotations=_year_labels)
fig.show()
It can be seen that, in each year, the majority of the participants are men. The same distribution can be broken down by country as follows.
# Share of each gender within every country, for each survey year.
_yearly = []
for _year, _frame in ((2019, Data19), (2018, Data18), (2017, Data17)):
    _counts = _frame.groupby(['Country', 'Gender'])['Gender'].count().to_frame('count').reset_index(drop=False)
    _counts['Year'] = _year
    # Per-country totals via transform: the column is float from the start, rather than
    # an integer column that is later overwritten with floats slice by slice.
    _totals = _counts.groupby('Country')['count'].transform('sum')
    _counts['Percentage'] = (100 * _counts['count'] / _totals).round(2)
    _yearly.append(_counts)
Temp = pd.concat(_yearly)
# Keep only the countries selected for the top-countries chart.
Temp = Temp.loc[Temp.Country.isin(TopCoutries)]
Temp = Temp.sort_values(by=['Country', 'Year', 'Gender']).reset_index(drop=True)

fig = make_subplots(rows=1, cols=3, subplot_titles=('2017', '2018', '2019'))
Colors = ['RoyalBlue', 'salmon', 'ForestGreen']
Name = ['Male', 'Female', 'Other']
for _col, Y in enumerate((2017, 2018, 2019), start=1):
    for _gender, _color in zip(Name, Colors):
        Temp0 = Temp.loc[(Temp.Year == Y) & (Temp.Gender == _gender)]
        # One legend entry per gender is enough, so only the first subplot contributes.
        fig.add_trace(go.Bar(name=_gender, x=Temp0.Country, y=Temp0['Percentage'],
                             marker_color=_color, text=Temp0['Percentage'],
                             textposition='inside', showlegend=(_col == 1)),
                      row=1, col=_col)
fig.update_layout(barmode='relative')
fig.update_traces(marker_line_color='black', marker_line_width=0.5, opacity=1)
# Same 0-100 scale on all three subplots, not just the first one.
fig.update_yaxes(range=[0, 100])
# The bars show percentages by gender, so the title says so.
fig.update_layout(title='Gender Share of Responses by Country (Top %i)' % Top, plot_bgcolor='white')
fig.show()
The number and percentage of the participants can be analyzed by continent as well.
C = ['deepskyblue', 'GreenYellow', 'OrangeRed', 'violet', 'LimeGreen', 'Olive']
# One donut chart of responses per continent for each survey year.
fig = make_subplots(rows=1, cols=3,
                    specs=[[{'type': 'domain'}, {'type': 'domain'}, {'type': 'domain'}]])
for _col, (_year, _frame) in enumerate((('2017', Data17), ('2018', Data18), ('2019', Data19)), start=1):
    Temp = _frame.groupby(['Continent'])['Continent'].count().to_frame('Count')
    Temp = Temp.reset_index(drop=False).sort_values(by=['Continent'])
    _pie_args = dict(labels=Temp.Continent.values, values=Temp.Count.values,
                     name=_year, textfont=dict(size=16))
    if _col == 1:
        # Only the first pie is given the explicit colour list.
        _pie_args['marker'] = dict(colors=C, line=dict(color='black', width=1))
    fig.add_trace(go.Pie(**_pie_args), 1, _col)
fig.update_traces(hole=.6, marker_line_color='black', marker_line_width=1, opacity=1)
# The year labels are placed in the hole of each donut.
_year_labels = [dict(text=_text, x=_x, y=0.5, font_size=20, showarrow=False)
                for _text, _x in (('2017', 0.11), ('2018', 0.5), ('2019', 0.88))]
fig.update_layout(title="Responses by Continent", font=dict(size=14),
                  legend=dict(orientation="v"), annotations=_year_labels)
fig.show()
# Share of each gender within every continent, for each survey year.
_yearly = []
for _year, _frame in ((2017, Data17), (2018, Data18), (2019, Data19)):
    _counts = _frame.groupby(['Continent', 'Gender'])['Continent'].count().to_frame('count').reset_index(drop=False)
    # Per-continent totals via transform: the column is float from the start, rather than
    # an integer column that is later overwritten with floats slice by slice.
    _totals = _counts.groupby('Continent')['count'].transform('sum')
    _counts['Percentage'] = (100 * (_counts['count'] / _totals)).round(2)
    _counts['Year'] = _year
    _yearly.append(_counts)
Temp = pd.concat(_yearly)
Temp = Temp.sort_values(by=['Continent', 'Year', 'Gender']).reset_index(drop=True)

fig = make_subplots(rows=1, cols=3, subplot_titles=('2017', '2018', '2019'))
Colors = ['RoyalBlue', 'salmon', 'ForestGreen']
Name = ['Male', 'Female', 'Other']
for _col, Y in enumerate((2017, 2018, 2019), start=1):
    for _gender, _color in zip(Name, Colors):
        Temp0 = Temp.loc[(Temp.Year == Y) & (Temp.Gender == _gender)]
        # One legend entry per gender is enough, so only the first subplot contributes.
        fig.add_trace(go.Bar(name=_gender, x=Temp0.Continent, y=Temp0['Percentage'],
                             marker_color=_color, text=Temp0['Percentage'],
                             textposition='inside', showlegend=(_col == 1)),
                      row=1, col=_col)
fig.update_layout(barmode='relative')
fig.update_traces(marker_line_color='black', marker_line_width=0.5, opacity=1)
# Same 0-100 scale on all three subplots, not just the first one.
fig.update_yaxes(range=[0, 100])
# The bars show percentages by gender, so the title says so.
fig.update_layout(title='Gender Share of Responses by Continent', plot_bgcolor='white')
fig.show()
fig = go.Figure()
# Bar colours for 2017, 2018 and 2019; also used by the following per-continent charts.
C = ['#9b59b6', '#e74c3c', '#34495e']
# One bar series per survey year with the responses of each North American country.
for _year, _frame, _color in zip((2017, 2018, 2019), (Data17, Data18, Data19), C):
    _subset = _frame.loc[_frame.Continent == 'North America']
    Temp = _subset.groupby(['Country'])['Country'].count().to_frame('count').reset_index(drop=False)
    fig.add_trace(go.Bar(x=Temp.Country, y=Temp['count'],
                         name='Responses in %i' % _year, marker_color=_color))
fig.update_traces(marker_line_color='black', marker_line_width=1, opacity=1)
fig.update_layout(title='The Number of Responses (North America)', plot_bgcolor='white',
                  width=600, yaxis_range=[0, 5e3])
fig.update_xaxes(showline=True, linewidth=1, linecolor='Lightgray', mirror=True)
fig.update_yaxes(showgrid=True, gridwidth=1, gridcolor='Lightgray',
                 showline=True, linewidth=1, linecolor='Lightgray', mirror=True)
fig.show()
fig = go.Figure()
# One bar series per survey year with the responses of each European country.
for _year, _frame, _color in zip((2017, 2018, 2019), (Data17, Data18, Data19), C):
    _subset = _frame.loc[_frame.Continent == 'Europe']
    Temp = _subset.groupby(['Country'])['Country'].count().to_frame('count').reset_index(drop=False)
    fig.add_trace(go.Bar(x=Temp.Country, y=Temp['count'],
                         name='Responses in %i' % _year, marker_color=_color))
fig.update_traces(marker_line_color='black', marker_line_width=1, opacity=1)
fig.update_layout(title='The Number of Responses (Europe)', plot_bgcolor='white',
                  legend=dict(orientation="h", x=0.4, y=1.2), yaxis_range=[0, 1e3])
fig.update_xaxes(showline=True, linewidth=1, linecolor='Lightgray', mirror=True)
fig.update_yaxes(showgrid=True, gridwidth=1, gridcolor='Lightgray',
                 showline=True, linewidth=1, linecolor='Lightgray', mirror=True)
fig.show()
fig = go.Figure()
# One bar series per survey year with the responses of each Asian country.
for _year, _frame, _color in zip((2017, 2018, 2019), (Data17, Data18, Data19), C):
    _subset = _frame.loc[_frame.Continent == 'Asia']
    Temp = _subset.groupby(['Country'])['Country'].count().to_frame('count').reset_index(drop=False)
    fig.add_trace(go.Bar(x=Temp.Country, y=Temp['count'],
                         name='Responses in %i' % _year, marker_color=_color))
fig.update_traces(marker_line_color='black', marker_line_width=1, opacity=1)
fig.update_layout(title='The Number of Responses (Asia)', plot_bgcolor='white',
                  legend=dict(orientation="h", x=0.4, y=1.2), yaxis_range=[0, 5e3])
fig.update_xaxes(showline=True, linewidth=1, linecolor='Lightgray', mirror=True)
fig.update_yaxes(showgrid=True, gridwidth=1, gridcolor='Lightgray',
                 showline=True, linewidth=1, linecolor='Lightgray', mirror=True)
fig.show()
# Bar colours for 2017, 2018 and 2019.
C = ['#9b59b6', '#e74c3c', '#34495e']
fig = go.Figure()
# One bar series per survey year with the number of responses in each age group.
for _year, _frame, _color in zip((2017, 2018, 2019), (Data17, Data18, Data19), C):
    Temp = _frame.groupby(['Age Group'])['Age Group'].count().to_frame('count').reset_index(drop=False)
    fig.add_trace(go.Bar(x=Temp['Age Group'], y=Temp['count'],
                         name='Responses in %i' % _year, marker_color=_color))
fig.update_traces(marker_line_color='black', marker_line_width=1, opacity=1)
fig.update_layout(title='The Number of Responses', plot_bgcolor='white',
                  legend=dict(orientation="h", x=0.4, y=1.2), yaxis_range=[0, 7e3])
fig.update_xaxes(showline=True, linewidth=1, linecolor='Lightgray', mirror=True)
fig.update_yaxes(showgrid=True, gridwidth=1, gridcolor='Lightgray',
                 showline=True, linewidth=1, linecolor='Lightgray', mirror=True)
fig.show()
| Age Categories | Age Groups |
|---|---|
| Youth | 18-24 years |
| Adults | 25-59 years |
| Seniors | 60+ |
Top = 20
# The `Top` countries with the most 2019 responses.
Temp0 = Data19.groupby(['Country'])['Country'].count().sort_values(ascending=False).index.tolist()[:Top]
# Work on a copy so that the new column is not written onto a slice of Data19.
Temp = Data19[['Age Group', 'Country']].copy()
# Collapse the age brackets into the three categories of the table above.
Temp['Age Categories'] = Temp['Age Group'].replace({
    '18-21': 'Youth', '22-24': 'Youth',
    '25-29': 'Adults', '30-34': 'Adults', '35-39': 'Adults', '40-44': 'Adults',
    '45-49': 'Adults', '50-54': 'Adults', '55-59': 'Adults',
    '60-69': 'Seniors', '70+': 'Seniors'})
Temp = Temp.groupby(['Country', 'Age Categories'])['Age Categories'].count().to_frame('count').reset_index(drop=False)
Temp = Temp.loc[Temp.Country.isin(Temp0)].copy()
# Explicit display order Youth < Adults < Seniors; every category has its own key
# (Seniors previously shared key 2 with Adults). Any other value keeps key 0.
Temp['Sort'] = Temp['Age Categories'].map({'Youth': 1, 'Adults': 2, 'Seniors': 3}).fillna(0).astype(int)
Temp = Temp.sort_values(['Country', 'Sort'])
del Temp0
fig = px.bar(Temp, x='Country', y='count', barmode='group', color='Age Categories')
fig.update_traces(marker_line_color='black', marker_line_width=1, opacity=1)
fig['layout']['yaxis'].update(range=[0, 3e3])
fig.update_layout(title='The Number of Responses', plot_bgcolor='white', legend=dict(orientation="h", x=0.55, y=1.1))
fig.update_yaxes(showgrid=True, gridwidth=1, gridcolor='Lightgray')
fig.update_xaxes(showline=True, linewidth=1, linecolor='Lightgray', mirror=True)
fig.update_yaxes(showline=True, linewidth=1, linecolor='Lightgray', mirror=True)
fig.show()
# Number of responses per education level and survey year, in long format.
_yearly = []
for _year, _frame in ((2019, Data19), (2018, Data18), (2017, Data17)):
    _counts = _frame.groupby(['FormalEducation'])['FormalEducation'].count().to_frame('count').reset_index(drop=False)
    _counts['Year'] = _year
    _yearly.append(_counts)
Temp = pd.concat(_yearly).sort_values(['FormalEducation', 'Year'])
# Only 'Year' becomes text, so it is used as a discrete colour; the counts stay numeric
# instead of being converted to strings along with the rest of the frame.
Temp['Year'] = Temp['Year'].astype(str)
fig = px.bar(Temp, y='FormalEducation', x='count', barmode='group', color='Year', orientation='h')
fig.update_traces(marker_line_color='black', marker_line_width=1, opacity=1)
fig['layout']['xaxis'].update(range=[0, 12e3])
fig.update_layout(title='The Number of Responses', plot_bgcolor='white', legend=dict(orientation="h", x=0.45, y=1.1))
fig.update_xaxes(showgrid=True, gridwidth=1, gridcolor='Lightgray')
fig.update_xaxes(showline=True, linewidth=1, linecolor='Lightgray', mirror=True)
fig.update_yaxes(showline=True, linewidth=1, linecolor='Lightgray', mirror=True)
fig.show()
Top = 5
# The `Top` countries by number of 2017 responses. 'Other' is filtered out rather than
# removed with list.remove(), which raises ValueError when the value is absent.
Temp0 = Data17.groupby(['Country'])['Country'].count().sort_values(ascending=False).index.tolist()
Temp0 = [_country for _country in Temp0 if _country != 'Other'][:Top]
Temp = Data17.groupby(['Country', 'FormalEducation'])['FormalEducation'].count().to_frame('count').reset_index(drop=False)
Temp = Temp.loc[Temp.Country.isin(Temp0)].copy()
# Share of each education level within its country; computed in one step so the column is float from the start.
Temp['Percentage'] = (100 * Temp['count'] / Temp.groupby('Country')['count'].transform('sum')).round(2)
del Temp0
fig = px.bar(Temp, y="Country", x="Percentage", color='FormalEducation', orientation='h',
             hover_data=["Country", "count"],
             height=400, title='Responses in 2017')
fig['layout']['xaxis'].update(range=[0, 100])
fig.update_layout(plot_bgcolor='white')
fig.update_traces(marker_line_color='black', marker_line_width=0.5, opacity=1)
fig.show()
# The `Top` countries by number of 2018 responses. 'Other' is filtered out rather than
# removed with list.remove(), which raises ValueError when the value is absent.
Temp0 = Data18.groupby(['Country'])['Country'].count().sort_values(ascending=False).index.tolist()
Temp0 = [_country for _country in Temp0 if _country != 'Other'][:Top]
Temp = Data18.groupby(['Country', 'FormalEducation'])['FormalEducation'].count().to_frame('count').reset_index(drop=False)
Temp = Temp.loc[Temp.Country.isin(Temp0)].copy()
# Share of each education level within its country; computed in one step so the column is float from the start.
Temp['Percentage'] = (100 * Temp['count'] / Temp.groupby('Country')['count'].transform('sum')).round(2)
del Temp0
fig = px.bar(Temp, y="Country", x="Percentage", color='FormalEducation', orientation='h',
             hover_data=["Country", "count"],
             height=400, title='Responses in 2018')
fig['layout']['xaxis'].update(range=[0, 100])
fig.update_layout(plot_bgcolor='white')
fig.update_traces(marker_line_color='black', marker_line_width=0.5, opacity=1)
fig.show()
# The `Top` countries by number of 2019 responses. 'Other' is filtered out rather than
# removed with list.remove(), which raises ValueError when the value is absent.
Temp0 = Data19.groupby(['Country'])['Country'].count().sort_values(ascending=False).index.tolist()
Temp0 = [_country for _country in Temp0 if _country != 'Other'][:Top]
Temp = Data19.groupby(['Country', 'FormalEducation'])['FormalEducation'].count().to_frame('count').reset_index(drop=False)
Temp = Temp.loc[Temp.Country.isin(Temp0)].copy()
# Share of each education level within its country; computed in one step so the column is float from the start.
Temp['Percentage'] = (100 * Temp['count'] / Temp.groupby('Country')['count'].transform('sum')).round(2)
del Temp0
fig = px.bar(Temp, y="Country", x="Percentage", color='FormalEducation', orientation='h',
             hover_data=["Country", "count"],
             height=400, title='Responses in 2019')
fig['layout']['xaxis'].update(range=[0, 100])
fig.update_layout(plot_bgcolor='white')
fig.update_traces(marker_line_color='black', marker_line_width=0.5, opacity=1)
fig.show()
# Word cloud built from all non-missing 2019 job titles joined into one text.
fig, ax = plt.subplots(1, 1, figsize=(8, 8))
_titles_text = " ".join(Data19['CurrentJobTitle'].dropna())
_cloud = WordCloud(background_color='white').generate(_titles_text)
ax.imshow(_cloud, interpolation='bilinear')
ax.axis('off')
ax.set_title('Job Titles in 2019', fontsize=20);
# Number of responses per job title for 2019 and 2018, in long format.
# NOTE: 2017 is not included here; the original code for it was disabled.
_yearly = []
for _year, _frame in ((2019, Data19), (2018, Data18)):
    _counts = _frame.groupby(['CurrentJobTitle'])['CurrentJobTitle'].count().to_frame('count').reset_index(drop=False)
    _counts['Year'] = _year
    _yearly.append(_counts)
Temp = pd.concat(_yearly).sort_values(['CurrentJobTitle', 'Year'])
# Only 'Year' becomes text, so it is used as a discrete colour; the counts stay numeric
# instead of being converted to strings along with the rest of the frame.
Temp['Year'] = Temp['Year'].astype(str)
fig = px.bar(Temp, y='CurrentJobTitle', x='count', barmode='group', color='Year', orientation='h', height=800)
fig.update_traces(marker_line_color='black', marker_line_width=1, opacity=1)
fig['layout']['xaxis'].update(range=[0, 6e3])
fig.update_layout(xaxis_title="Count", yaxis_title="Current Job Title")
fig.update_layout(title='The Number of Responses', plot_bgcolor='white', legend=dict(orientation="h", x=0.75, y=1.1))
fig.update_xaxes(showgrid=True, gridwidth=1, gridcolor='Lightgray')
fig.update_xaxes(showline=True, linewidth=1, linecolor='Lightgray', mirror=True)
fig.update_yaxes(showline=True, linewidth=1, linecolor='Lightgray', mirror=True)
fig.show()
# ---- 2018: split each salary bracket into lower / upper bound -------------------------
# The split on '-' yields a lower part such as '10' (from '10-20,000') or '500,000+'.
Temp = Data18['CurrentSalary'].str.split(pat="-", expand=True)
Temp.columns = ['SalaryMin', 'SalaryMax']
# regex=False: '+' is meant literally, and the default of `regex` differs between pandas versions.
# Missing salaries become '0' (a string, so the column stays homogeneous until the cast).
Temp['SalaryMin'] = (Temp['SalaryMin'].str.replace('+', '', regex=False)
                                      .str.replace(',', '', regex=False)
                                      .fillna('0').astype(int))
# Only rows with an upper bound are scaled from thousands to dollars; the open-ended top
# bracket has no upper bound and was previously scaled by 1000 as well.
Temp.loc[Temp['SalaryMax'].notna(), 'SalaryMin'] *= 1000
Temp['SalaryMax'] = Temp['SalaryMax'].str.replace(',', '', regex=False)
Temp18 = pd.concat([Data18['CurrentSalary'], Temp], axis=1)
# Merge the two 2018 brackets between 300k and 500k into one.
Temp18.loc[Temp18.CurrentSalary.isin(['300-400,000', '400-500,000']),
           ['CurrentSalary', 'SalaryMin']] = '300-500,000', int(3e5)
Group18 = Temp18.groupby(['CurrentSalary', 'SalaryMin'])['SalaryMin'].agg(['count']).reset_index()\
    .sort_values(['SalaryMin']).reset_index(drop=True)
del Temp

# ---- 2019: same split; after stripping '$', '> ' and ',' the lower part is a plain number
Temp = Data19['CurrentSalary'].str.split(pat="-", expand=True)
Temp.columns = ['SalaryMin', 'SalaryMax']
Temp['SalaryMin'] = (Temp['SalaryMin'].str.replace('$', '', regex=False)
                                      .str.replace('> ', '', regex=False)
                                      .str.replace(',', '', regex=False)
                                      .fillna('0').astype(int))
Temp['SalaryMax'] = Temp['SalaryMax'].str.replace(',', '', regex=False)
Temp19 = pd.concat([Data19['CurrentSalary'], Temp], axis=1)
Temp19.loc[Temp19.CurrentSalary == '$0-999', 'CurrentSalary'] = '0-999'
Temp19.loc[Temp19.CurrentSalary == '> $500,000', 'CurrentSalary'] = '500,000+'
# Group19 keeps the original 2019 brackets; it is plotted on its own further down.
Group19 = Temp19.groupby(['CurrentSalary', 'SalaryMin'])['SalaryMin'].agg(['count']).reset_index()\
    .sort_values(['SalaryMin']).reset_index(drop=True)
del Temp

# ---- Relabel the 2019 brackets with the 2018 bracket that contains their lower bound ---
# Walk the 2018 lower bounds in ascending order (skipping the first): every 2019 bracket
# that starts below the bound `s` and has not been relabelled yet gets the label of the
# highest 2018 bracket starting below `s`. `Temp` collects the labels already handled.
Temp = []
for s in Group18.SalaryMin.unique()[1:]:
    Temp0 = Group19.loc[Group19.SalaryMin < s, 'CurrentSalary'].tolist()
    Temp0 = list(set(Temp0) - set(Temp))
    Temp19.loc[Temp19.CurrentSalary.isin(Temp0), 'CurrentSalary'] = \
        Group18.loc[Group18.SalaryMin < s, 'CurrentSalary'].tolist()[-1]
    Temp.extend(Temp0)
# Rows whose lower bound is at least 500k always carry the open-ended top label.
Temp19.loc[Temp19.SalaryMin >= 5e5, 'CurrentSalary'] = '500,000+'
del Temp0, Group18

# ---- Counts per harmonised bracket for both years, in long format ----------------------
Group = Temp18.groupby(['CurrentSalary', 'SalaryMin'])['CurrentSalary'].agg(['count']).reset_index()\
    .sort_values(['SalaryMin']).reset_index(drop=True)
Group = Group[['CurrentSalary', 'count']].merge(
    Temp19.groupby(['CurrentSalary'])['CurrentSalary'].agg(['count']).reset_index(drop=False),
    left_on='CurrentSalary', right_on='CurrentSalary')
Group.columns = ['CurrentSalary', '2018', '2019']
Group = Group.melt(id_vars=['CurrentSalary'], value_vars=['2018', '2019'], var_name='Year', value_name='Count')
# 2019 responses per original 2019 salary bracket.
fig = px.bar(Group19, x='CurrentSalary', y='count', text='count')
fig.update_traces(marker_color='skyblue', marker_line_color='navy', marker_line_width=1,
                  opacity=1, texttemplate='%{text:.2s}', textposition='outside')
fig.update_layout(title='The Number of Responses (2019)', yaxis_title="Count",
                  xaxis_title="Current Salary", yaxis_range=[0, 1600],
                  uniformtext_minsize=8, uniformtext_mode='hide', plot_bgcolor='white')
fig.update_xaxes(showline=True, linewidth=1, linecolor='Lightgray', mirror=True)
fig.update_yaxes(showgrid=True, gridwidth=1, gridcolor='Lightgray',
                 showline=True, linewidth=1, linecolor='Lightgray', mirror=True)
fig.show()
# Grouped bar chart of the long-form Group table: one bar per Year within
# each CurrentSalary label.
fig = px.bar(Group, x='CurrentSalary', y='Count', color='Year', text='Count', barmode='group')
fig.update_traces(marker_line_color='navy', marker_line_width=1, opacity=1,
                  texttemplate='%{text:.2s}', textposition='outside')
fig.update_layout(title='The Number of Responses',
                  xaxis_title="Current Salary", yaxis_title="Count",
                  uniformtext_minsize=8, uniformtext_mode='hide', plot_bgcolor='white',
                  legend=dict(orientation="h", x=0.75, y=1.1))
# Light-gray frame on both axes; horizontal grid lines only.
fig.update_xaxes(showline=True, linewidth=1, linecolor='Lightgray', mirror=True)
fig.update_yaxes(range=[0, 5e3], showgrid=True, gridwidth=1, gridcolor='Lightgray',
                 showline=True, linewidth=1, linecolor='Lightgray', mirror=True)
fig.show()
# Put job title, country and education in front of the salary columns of each
# year's table.
Cols = ['CurrentSalary', 'SalaryMin', 'SalaryMax']
Temp18 = pd.concat([Data18[['CurrentJobTitle', 'Country', 'FormalEducation']], Temp18], axis=1)
Temp19 = pd.concat([Data19[['CurrentJobTitle', 'Country', 'FormalEducation']], Temp19], axis=1)

# Collapse the fine salary labels into four coarse bins.  Each entry pairs
# the labels to replace with the (label, lower bound, upper bound) written
# into `Cols`; labels not listed here are left untouched.
for _frame in (Temp18, Temp19):
    for _labels, _coarse in (
        (['10-20,000', '20-30,000', '30-40,000', '40-50,000'],
         ('10-50,000', int(1e4), int(5e4))),
        (['50-60,000', '60-70,000', '70-80,000', '80-90,000', '90-100,000'],
         ('50-100,000', int(5e4), int(1e5))),
        (['100-125,000', '125-150,000', '150-200,000'],
         ('100-200,000', int(1e5), int(2e5))),
        (['200-250,000', '250-300,000', '300-500,000', '500,000+'],
         ('200,000+', int(2e5), np.nan)),
    ):
        _frame.loc[_frame.CurrentSalary.isin(_labels), Cols] = _coarse
# Remove the loop helpers so no extra names stay in the notebook namespace.
del _frame, _labels, _coarse
# --- Salaries by job title, 2018 --------------------------------------------
# Respondent count per (job title, salary bin, country).
Temp = Temp18.groupby(['CurrentJobTitle', 'CurrentSalary', 'Country'])['CurrentSalary'].agg(['count']).reset_index()
# Explicit rank for the coarse bins so they sort by amount rather than
# alphabetically; any other label keeps rank 0.
Temp['Sort'] = (Temp.CurrentSalary
                .map({'10-50,000': 1, '50-100,000': 2, '100-200,000': 3, '200,000+': 4})
                .fillna(0).astype(int))
Temp.sort_values(['CurrentJobTitle', 'Country', 'Sort'], inplace=True)

# One grouped bar chart per country; the second tuple item is the upper limit
# of the y axis for that country.
for Country, y_max in (('United States', 500), ('Canada', 70)):
    fig = px.bar(Temp.loc[Temp.Country == Country],
                 x='CurrentJobTitle', y='count', color='CurrentSalary', text='count', barmode='group')
    fig.update_traces(marker_line_color='navy', marker_line_width=1, opacity=1,
                      texttemplate='%{text:.2s}', textposition='outside')
    # Fix: the axis titles were swapped -- x carries the job titles and y the
    # counts.
    fig.update_layout(title='Salaries and Job Titles in 2018 (%s)' % Country,
                      xaxis_title="Current Job Title", yaxis_title="Count",
                      uniformtext_minsize=8, uniformtext_mode='hide', plot_bgcolor='white')
    fig.update_xaxes(showline=True, linewidth=1, linecolor='Lightgray', mirror=True)
    fig.update_yaxes(range=[0, y_max], showgrid=True, gridwidth=1, gridcolor='Lightgray',
                     showline=True, linewidth=1, linecolor='Lightgray', mirror=True)
    fig.show()
# --- Salaries by job title, 2019 --------------------------------------------
# Respondent count per (job title, salary bin, country).
Temp = Temp19.groupby(['CurrentJobTitle', 'CurrentSalary', 'Country'])['CurrentSalary'].agg(['count']).reset_index()
# Explicit rank for the coarse bins so they sort by amount rather than
# alphabetically; any other label keeps rank 0.
Temp['Sort'] = (Temp.CurrentSalary
                .map({'10-50,000': 1, '50-100,000': 2, '100-200,000': 3, '200,000+': 4})
                .fillna(0).astype(int))
Temp.sort_values(['CurrentJobTitle', 'Country', 'Sort'], inplace=True)

# One grouped bar chart per country; the second tuple item is the upper limit
# of the y axis for that country.
for Country, y_max in (('United States', 500), ('Canada', 60)):
    fig = px.bar(Temp.loc[Temp.Country == Country],
                 x='CurrentJobTitle', y='count', color='CurrentSalary', text='count', barmode='group')
    fig.update_traces(marker_line_color='navy', marker_line_width=1, opacity=1,
                      texttemplate='%{text:.2s}', textposition='outside')
    # Fix: the axis titles were swapped -- x carries the job titles and y the
    # counts.
    fig.update_layout(title='Salaries and Job Titles in 2019 (%s)' % Country,
                      xaxis_title="Current Job Title", yaxis_title="Count",
                      uniformtext_minsize=8, uniformtext_mode='hide', plot_bgcolor='white')
    fig.update_xaxes(showline=True, linewidth=1, linecolor='Lightgray', mirror=True)
    fig.update_yaxes(range=[0, y_max], showgrid=True, gridwidth=1, gridcolor='Lightgray',
                     showline=True, linewidth=1, linecolor='Lightgray', mirror=True)
    fig.show()
def mysplit(Text, S):
    """Return the part of ``Text`` that follows the first occurrence of ``S``.

    Used to strip a shared question prefix from survey column names so that
    only the answer option remains.

    Args:
        Text: String to split (e.g. a full column name).
        S: Separator / prefix to look for.

    Returns:
        Everything after the first occurrence of ``S``.  If ``S`` occurs more
        than once, the later occurrences stay in the result.

    Raises:
        ValueError: If ``S`` does not occur in ``Text``.
    """
    # partition() splits on the first occurrence only, so a repeated
    # separator no longer breaks the two-value unpacking the old split() used.
    _, found, Out = Text.partition(S)
    if not found:
        raise ValueError('separator %r not found in %r' % (S, Text))
    return Out
def mysplit2(Text):
    """Return ``Text`` up to (not including) its first ``' ('``.

    Drops a trailing parenthesised remark from a label; text without
    ``' ('`` is returned unchanged.
    """
    return Text.split(' (', 1)[0]
# --- Important work activities, 2018 vs 2019 --------------------------------
# Every answer option is its own column whose name starts with the question
# text S; the non-null count of a column is the number of respondents who
# selected that option.
S = 'Select any activities that make up an important part of your role at work: (Select all that apply) - Selected Choice - '

Temp18 = Data18[Search_df(Data18, S)].rename(columns=lambda c: mysplit(c, S))
Temp18 = Temp18.agg(['count']).T.reset_index()
Temp18.columns = ['Activities', 'Count']
Temp18['Year'] = '2018'

Temp19 = Data19[Search_df(Data19, S)].rename(columns=lambda c: mysplit(c, S))
Temp19 = Temp19.agg(['count']).T.reset_index()
Temp19.columns = ['Activities', 'Count']
Temp19['Year'] = '2019'

# Stack both years into one long table for the hue-split bar plot.
Temp = pd.concat([Temp18, Temp19])
del Temp18, Temp19

fig, ax = plt.subplots(figsize=(6, 6), sharey=True)
_ = sns.barplot(data=Temp, x='Count', y='Activities', hue='Year', ax=ax, edgecolor='k', hatch="///")
_ = ax.set_yticklabels(ax.get_yticklabels(), fontsize=12)
_ = ax.set_xlim([0, 1e4])
_ = ax.legend(bbox_to_anchor=(0.7, 0.1), fontsize=14)
# --- Favourite media sources (2019) ------------------------------------------
S = 'Who/what are your favorite media sources that report on data science topics? (Select all that apply) - Selected Choice - '
Col = 'Media Sources'
# Keep the option columns of this question and name them after the option.
Temp = Data19[Search_df(Data19, S)].rename(columns=lambda c: mysplit(c, S))
# Non-null answers per option -> two-column table (option, count).
Temp = Temp.agg(['count']).T.reset_index()
Temp.columns = [Col, 'Count']
# Drop the parenthesised remark from each option label.
Temp[Col] = Temp[Col].map(mysplit2)
Temp = Temp.sort_values(by=Col)

fig = px.bar(Temp, x='Count', y=Col, text='Count', orientation='h', barmode='group', height=500)
fig.update_traces(marker_color='lightskyblue', marker_line_color='navy', marker_line_width=1, opacity=1,
                  texttemplate='%{text:.2s}', textposition='outside')
fig.update_layout(title='%s (2019)' % Col, plot_bgcolor='white',
                  uniformtext_minsize=8, uniformtext_mode='hide')
fig.update_xaxes(range=[0, 12e3], showgrid=True, gridwidth=1, gridcolor='Lightgray',
                 showline=True, linewidth=1, linecolor='Lightgray', mirror=True)
fig.update_yaxes(showline=True, linewidth=1, linecolor='Lightgray', mirror=True)
fig.show()
# --- Data science course platforms (2019) ------------------------------------
S = 'On which platforms have you begun or completed data science courses? (Select all that apply) - Selected Choice - '
Col = 'Data Science Courses'
# Keep the option columns of this question and name them after the option.
Temp = Data19[Search_df(Data19, S)].rename(columns=lambda c: mysplit(c, S))
# Non-null answers per option -> two-column table (option, count).
Temp = Temp.agg(['count']).T.reset_index()
Temp.columns = [Col, 'Count']
# Drop the parenthesised remark from each option label.
Temp[Col] = Temp[Col].map(mysplit2)
Temp = Temp.sort_values(by=Col)

fig = px.bar(Temp, x='Count', y=Col, text='Count', orientation='h', barmode='group', height=500)
fig.update_traces(marker_color='orchid', marker_line_color='indigo', marker_line_width=1.5, opacity=1,
                  texttemplate='%{text:.2s}', textposition='outside')
fig.update_layout(title='%s (2019)' % Col, plot_bgcolor='white',
                  uniformtext_minsize=8, uniformtext_mode='hide')
fig.update_xaxes(range=[0, 10e3], showgrid=True, gridwidth=1, gridcolor='Lightgray',
                 showline=True, linewidth=1, linecolor='Lightgray', mirror=True)
fig.update_yaxes(showline=True, linewidth=1, linecolor='Lightgray', mirror=True)
fig.show()
# --- IDEs used on a regular basis (2019) --------------------------------------
S = """Which of the following integrated development environments (IDE's) do you use on a regular basis? (Select all that apply) - Selected Choice - """
Col = """IDE's"""
# Keep the option columns of this question and name them after the option.
Temp = Data19[Search_df(Data19, S)].rename(columns=lambda c: mysplit(c, S))
# Non-null answers per option -> two-column table (option, count).
Temp = Temp.agg(['count']).T.reset_index()
Temp.columns = [Col, 'Count']
# Trim surrounding whitespace from the option labels.
Temp[Col] = Temp[Col].map(str.strip)
Temp = Temp.sort_values(by=Col)

fig = px.bar(Temp, x='Count', y=Col, text='Count', orientation='h', barmode='group', height=500)
fig.update_traces(marker_color='limegreen', marker_line_color='darkgreen', marker_line_width=2, opacity=1,
                  texttemplate='%{text:.2s}', textposition='outside')
fig.update_layout(title='%s (2019)' % Col, plot_bgcolor='white',
                  uniformtext_minsize=8, uniformtext_mode='hide')
fig.update_xaxes(range=[0, 12e3], showgrid=True, gridwidth=1, gridcolor='Lightgray',
                 showline=True, linewidth=1, linecolor='Lightgray', mirror=True)
fig.update_yaxes(showline=True, linewidth=1, linecolor='Lightgray', mirror=True)
fig.show()
# --- Hosted notebook products (2019) -------------------------------------------
S = 'Which of the following hosted notebook products do you use on a regular basis? (Select all that apply) - Selected Choice - '
Col = 'Notebook Host'
# Keep the option columns of this question and name them after the option.
Temp = Data19[Search_df(Data19, S)].rename(columns=lambda c: mysplit(c, S))
# Non-null answers per option -> two-column table (option, count).
Temp = Temp.agg(['count']).T.reset_index()
Temp.columns = [Col, 'Count']
# Trim surrounding whitespace from the option labels.
Temp[Col] = Temp[Col].map(str.strip)
Temp = Temp.sort_values(by=Col)

fig = px.bar(Temp, x='Count', y=Col, text='Count', orientation='h', barmode='group', height=400)
fig.update_traces(marker_color='bisque', marker_line_color='darkorange', marker_line_width=1.5, opacity=1,
                  texttemplate='%{text:.2s}', textposition='outside')
fig.update_layout(title='%s (2019)' % Col, plot_bgcolor='white',
                  uniformtext_minsize=8, uniformtext_mode='hide')
fig.update_xaxes(range=[0, 6e3], showgrid=True, gridwidth=1, gridcolor='Lightgray',
                 showline=True, linewidth=1, linecolor='Lightgray', mirror=True)
fig.update_yaxes(showline=True, linewidth=1, linecolor='Lightgray', mirror=True)
fig.show()
# --- Programming languages (2019) ----------------------------------------------
S = 'What programming languages do you use on a regular basis? (Select all that apply) - Selected Choice - '
Col = 'Programming Languages'
# Keep the option columns of this question and name them after the option.
Temp = Data19[Search_df(Data19, S)].rename(columns=lambda c: mysplit(c, S))
# Non-null answers per option -> two-column table (option, count).
Temp = Temp.agg(['count']).T.reset_index()
Temp.columns = [Col, 'Count']
# Trim surrounding whitespace from the option labels.
Temp[Col] = Temp[Col].map(str.strip)
Temp = Temp.sort_values(by=Col)

fig = px.bar(Temp, x='Count', y=Col, text='Count', orientation='h', barmode='group', height=500)
fig.update_traces(marker_color='lightcoral', marker_line_color='darkred', marker_line_width=1.5, opacity=1,
                  texttemplate='%{text:.2s}', textposition='outside')
fig.update_layout(title='%s (2019)' % Col, plot_bgcolor='white',
                  uniformtext_minsize=8, uniformtext_mode='hide')
fig.update_xaxes(range=[0, 14e3], showgrid=True, gridwidth=1, gridcolor='Lightgray',
                 showline=True, linewidth=1, linecolor='Lightgray', mirror=True)
fig.update_yaxes(showline=True, linewidth=1, linecolor='Lightgray', mirror=True)
fig.show()
# --- Data visualization libraries (2019) ----------------------------------------
S = 'What data visualization libraries or tools do you use on a regular basis? (Select all that apply) - Selected Choice -'
Col = 'Visualization Libraries'
# Keep the option columns of this question and name them after the option.
Temp = Data19[Search_df(Data19, S)].rename(columns=lambda c: mysplit(c, S))
# Non-null answers per option -> two-column table (option, count).
Temp = Temp.agg(['count']).T.reset_index()
Temp.columns = [Col, 'Count']
# S has no trailing blank, so every label starts with a space: trim it.
Temp[Col] = Temp[Col].map(str.strip)
Temp = Temp.sort_values(by=Col)

fig = px.bar(Temp, x='Count', y=Col, text='Count', orientation='h', barmode='group', height=500)
fig.update_traces(marker_color='whitesmoke', marker_line_color='dimgray', marker_line_width=1.5, opacity=1,
                  texttemplate='%{text:.2s}', textposition='outside')
# This chart keeps plotly's default background and axis styling; the white
# background / gray frame used by the other charts is intentionally not
# applied (presumably so the whitesmoke bars stay visible -- TODO confirm).
fig.update_layout(title='%s (2019)' % Col, uniformtext_minsize=8, uniformtext_mode='hide')
fig.update_xaxes(range=[0, 12e3])
fig.show()
# --- Specialized hardware (2019) --------------------------------------------------
S = 'Which types of specialized hardware do you use on a regular basis? (Select all that apply) - Selected Choice -'
Col = 'Specialized Hardwares'
# Keep the option columns of this question and name them after the option.
Temp = Data19[Search_df(Data19, S)].rename(columns=lambda c: mysplit(c, S))
# Non-null answers per option -> two-column table (option, count).
Temp = Temp.agg(['count']).T.reset_index()
Temp.columns = [Col, 'Count']
# S has no trailing blank, so every label starts with a space: trim it.
Temp[Col] = Temp[Col].map(str.strip)
Temp = Temp.sort_values(by=Col)

fig = px.bar(Temp, x='Count', y=Col, text='Count', orientation='h', barmode='group', height=300)
fig.update_traces(marker_color='pink', marker_line_color='mediumvioletred', marker_line_width=1.5, opacity=1,
                  texttemplate='%{text:.2s}', textposition='outside')
fig.update_layout(title='%s (2019)' % Col, plot_bgcolor='white',
                  uniformtext_minsize=8, uniformtext_mode='hide')
fig.update_xaxes(range=[0, 12e3], showgrid=True, gridwidth=1, gridcolor='Lightgray',
                 showline=True, linewidth=1, linecolor='Lightgray', mirror=True)
fig.update_yaxes(showline=True, linewidth=1, linecolor='Lightgray', mirror=True)
fig.show()
# --- ML algorithms (2019) -----------------------------------------------------------
S = 'Which of the following ML algorithms do you use on a regular basis? (Select all that apply): - Selected Choice -'
Col = 'ML Algorithms'
# Keep the option columns of this question and name them after the option.
Temp = Data19[Search_df(Data19, S)].rename(columns=lambda c: mysplit(c, S))
# Non-null answers per option -> two-column table (option, count).
Temp = Temp.agg(['count']).T.reset_index()
Temp.columns = [Col, 'Count']
# S has no trailing blank, so every label starts with a space: trim it.
Temp[Col] = Temp[Col].map(str.strip)
Temp = Temp.sort_values(by=Col)

fig = px.bar(Temp, x='Count', y=Col, text='Count', orientation='h', barmode='group', height=500)
fig.update_traces(marker_color='mediumpurple', marker_line_color='darkred', marker_line_width=1.5, opacity=1,
                  texttemplate='%{text:.2s}', textposition='outside')
fig.update_layout(title='%s (2019)' % Col, plot_bgcolor='white',
                  uniformtext_minsize=8, uniformtext_mode='hide')
fig.update_xaxes(range=[0, 12e3], showgrid=True, gridwidth=1, gridcolor='Lightgray',
                 showline=True, linewidth=1, linecolor='Lightgray', mirror=True)
fig.update_yaxes(showline=True, linewidth=1, linecolor='Lightgray', mirror=True)
fig.show()
# --- Categories of ML tools (2019) ---------------------------------------------------
S = 'Which categories of ML tools do you use on a regular basis? (Select all that apply) - Selected Choice - '
Col = 'ML Tools'
# Keep the option columns of this question and name them after the option.
Temp = Data19[Search_df(Data19, S)].rename(columns=lambda c: mysplit(c, S))
# Non-null answers per option -> two-column table (option, count).
Temp = Temp.agg(['count']).T.reset_index()
Temp.columns = [Col, 'Count']
# Trim surrounding whitespace from the option labels.
Temp[Col] = Temp[Col].map(str.strip)
Temp = Temp.sort_values(by=Col)

fig = px.bar(Temp, x='Count', y=Col, text='Count', orientation='h', barmode='group', height=450)
fig.update_traces(marker_color='lightgreen', marker_line_color='darkolivegreen', marker_line_width=2, opacity=1,
                  texttemplate='%{text:.2s}', textposition='outside')
fig.update_layout(title='%s (2019)' % Col, plot_bgcolor='white',
                  uniformtext_minsize=8, uniformtext_mode='hide')
fig.update_xaxes(range=[0, 10e3], showgrid=True, gridwidth=1, gridcolor='Lightgray',
                 showline=True, linewidth=1, linecolor='Lightgray', mirror=True)
fig.update_yaxes(showline=True, linewidth=1, linecolor='Lightgray', mirror=True)
fig.show()
# --- Computer vision methods (2019) --------------------------------------------------
S = 'Which categories of computer vision methods do you use on a regular basis? (Select all that apply) - Selected Choice - '
Col = 'Computer Vision Methods'
# Keep the option columns of this question and name them after the option.
Temp = Data19[Search_df(Data19, S)].rename(columns=lambda c: mysplit(c, S))
# Non-null answers per option -> two-column table (option, count).
Temp = Temp.agg(['count']).T.reset_index()
Temp.columns = [Col, 'Count']
Temp[Col] = Temp[Col].map(str.strip)
Temp = Temp.sort_values(by=Col)
# Show the table with the full option labels first ...
display(Temp.style.hide_index())
# ... then cut each label at its first ' (' and re-trim, so the y-axis tick
# labels of the plot are shorter.
Temp[Col] = Temp[Col].map(mysplit2).map(str.strip)

fig, ax = plt.subplots(figsize=(11, 5), sharey=True)
_ = sns.barplot(data=Temp, x='Count', y=Col, ax=ax, palette='summer', edgecolor='k', hatch="///")
_ = ax.set_yticklabels(ax.get_yticklabels(), fontsize=12)
_ = ax.set_xlim([0, 3.5e3])
# --- NLP methods ------------------------------------------------------------------------
S = 'Which of the following natural language processing (NLP) methods do you use on a regular basis? (Select all that apply) - Selected Choice - '
Col = 'Natural Language Processing (NLP)'
# Keep the option columns of this question and name them after the option.
Temp = Data19[Search_df(Data19, S)].rename(columns=lambda c: mysplit(c, S))
# Non-null answers per option -> two-column table (option, count).
Temp = Temp.agg(['count']).T.reset_index()
Temp.columns = [Col, 'Count']
# Trim surrounding whitespace from the option labels.
Temp[Col] = Temp[Col].map(str.strip)
Temp = Temp.sort_values(by=Col)

fig = px.bar(Temp, x='Count', y=Col, text='Count', orientation='h', barmode='group', height=400)
fig.update_traces(marker_color='cornsilk', marker_line_color='darkgoldenrod', marker_line_width=2, opacity=1,
                  texttemplate='%{text:.2s}', textposition='outside')
# The title is the bare label here (no year suffix).
fig.update_layout(title='%s' % Col, plot_bgcolor='white',
                  uniformtext_minsize=8, uniformtext_mode='hide')
fig.update_xaxes(range=[0, 2.5e3], showgrid=True, gridwidth=1, gridcolor='Lightgray',
                 showline=True, linewidth=1, linecolor='Lightgray', mirror=True)
fig.update_yaxes(showline=True, linewidth=1, linecolor='Lightgray', mirror=True)
fig.show()
# --- Machine learning frameworks (2019) ----------------------------------------------
S = 'Which of the following machine learning frameworks do you use on a regular basis? (Select all that apply) - Selected Choice -'
Col = 'Machine Learning Frameworks'
# Keep the option columns of this question and name them after the option.
Temp = Data19[Search_df(Data19, S)].rename(columns=lambda c: mysplit(c, S))
# Non-null answers per option -> two-column table (option, count).
Temp = Temp.agg(['count']).T.reset_index()
Temp.columns = [Col, 'Count']
# S has no trailing blank, so every label starts with a space: trim it.
Temp[Col] = Temp[Col].map(str.strip)
Temp = Temp.sort_values(by=Col)

fig = px.bar(Temp, x='Count', y=Col, text='Count', orientation='h', barmode='group', height=500)
fig.update_traces(marker_color='lightsalmon', marker_line_color='darkred', marker_line_width=1.5, opacity=1,
                  texttemplate='%{text:.2s}', textposition='outside')
fig.update_layout(title='%s (2019)' % Col, plot_bgcolor='white',
                  uniformtext_minsize=8, uniformtext_mode='hide')
fig.update_xaxes(range=[0, 10e3], showgrid=True, gridwidth=1, gridcolor='Lightgray',
                 showline=True, linewidth=1, linecolor='Lightgray', mirror=True)
fig.update_yaxes(showline=True, linewidth=1, linecolor='Lightgray', mirror=True)
fig.show()
# --- Cloud computing platforms (2019) --------------------------------------------------
S = 'Which of the following cloud computing platforms do you use on a regular basis? (Select all that apply) - Selected Choice -'
Col = 'Cloud Computing Platforms'
# Keep the option columns of this question and name them after the option.
Temp = Data19[Search_df(Data19, S)].rename(columns=lambda c: mysplit(c, S))
# Non-null answers per option -> two-column table (option, count).
Temp = Temp.agg(['count']).T.reset_index()
Temp.columns = [Col, 'Count']
# S has no trailing blank, so every label starts with a space: trim it.
Temp[Col] = Temp[Col].map(str.strip)
Temp = Temp.sort_values(by=Col)

fig = px.bar(Temp, x='Count', y=Col, text='Count', orientation='h', barmode='group', height=500)
fig.update_traces(marker_color='yellowgreen', marker_line_color='darkgreen', marker_line_width=1.5, opacity=1,
                  texttemplate='%{text:.2s}', textposition='outside')
fig.update_layout(title='%s (2019)' % Col, plot_bgcolor='white',
                  uniformtext_minsize=8, uniformtext_mode='hide')
fig.update_xaxes(range=[0, 3e3], showgrid=True, gridwidth=1, gridcolor='Lightgray',
                 showline=True, linewidth=1, linecolor='Lightgray', mirror=True)
fig.update_yaxes(showline=True, linewidth=1, linecolor='Lightgray', mirror=True)
fig.show()
# --- Cloud computing products (2019) ----------------------------------------------------
S = 'Which specific cloud computing products do you use on a regular basis? (Select all that apply) - Selected Choice -'
Col = 'Cloud Computing Products'
# Keep the option columns of this question and name them after the option.
Temp = Data19[Search_df(Data19, S)].rename(columns=lambda c: mysplit(c, S))
# Non-null answers per option -> two-column table (option, count).
Temp = Temp.agg(['count']).T.reset_index()
Temp.columns = [Col, 'Count']
# S has no trailing blank, so every label starts with a space: trim it.
Temp[Col] = Temp[Col].map(str.strip)
Temp = Temp.sort_values(by=Col)

# White bars on plotly's default background; no axis frame styling is applied
# to this chart.
fig = px.bar(Temp, x='Count', y=Col, text='Count', orientation='h', barmode='group', height=500)
fig.update_traces(marker_color='white', marker_line_color='dimgray', marker_line_width=1.5, opacity=1,
                  texttemplate='%{text:.2s}', textposition='outside')
fig.update_layout(title='%s (2019)' % Col, uniformtext_minsize=8, uniformtext_mode='hide')
fig.update_xaxes(range=[0, 3.5e3])
fig.show()
# --- Big data / analytics products (2019) -----------------------------------------------
S = 'Which specific big data / analytics products do you use on a regular basis? (Select all that apply) - Selected Choice -'
Col = 'Big Data / Analytics Products'
# Keep the option columns of this question and name them after the option.
Temp = Data19[Search_df(Data19, S)].rename(columns=lambda c: mysplit(c, S))
# Non-null answers per option -> two-column table (option, count).
Temp = Temp.agg(['count']).T.reset_index()
Temp.columns = [Col, 'Count']
# S has no trailing blank, so every label starts with a space: trim it.
Temp[Col] = Temp[Col].map(str.strip)
Temp = Temp.sort_values(by=Col)

fig = px.bar(Temp, x='Count', y=Col, text='Count', orientation='h', barmode='group', height=500)
fig.update_traces(marker_color='orchid', marker_line_color='royalblue', marker_line_width=1.5, opacity=1,
                  texttemplate='%{text:.2s}', textposition='outside')
fig.update_layout(title='%s (2019)' % Col, plot_bgcolor='white',
                  uniformtext_minsize=8, uniformtext_mode='hide')
fig.update_xaxes(range=[0, 4.5e3], showgrid=True, gridwidth=1, gridcolor='Lightgray',
                 showline=True, linewidth=1, linecolor='Lightgray', mirror=True)
fig.update_yaxes(showline=True, linewidth=1, linecolor='Lightgray', mirror=True)
fig.show()
# --- Machine learning products (2019) ---------------------------------------------------
S = 'Which of the following machine learning products do you use on a regular basis? (Select all that apply) - Selected Choice -'
Col = 'Machine Learning Products'
# Keep the option columns of this question and name them after the option.
Temp = Data19[Search_df(Data19, S)].rename(columns=lambda c: mysplit(c, S))
# Non-null answers per option -> two-column table (option, count).
Temp = Temp.agg(['count']).T.reset_index()
Temp.columns = [Col, 'Count']
# S has no trailing blank, so every label starts with a space: trim it.
Temp[Col] = Temp[Col].map(str.strip)
Temp = Temp.sort_values(by=Col)

fig = px.bar(Temp, x='Count', y=Col, text='Count', orientation='h', barmode='group', height=500)
fig.update_traces(marker_color='tomato', marker_line_color='dimgray', marker_line_width=1.5, opacity=1,
                  texttemplate='%{text:.2s}', textposition='outside')
fig.update_layout(title='%s (2019)' % Col, plot_bgcolor='white',
                  uniformtext_minsize=8, uniformtext_mode='hide')
fig.update_xaxes(range=[0, 5e3], showgrid=True, gridwidth=1, gridcolor='Lightgray',
                 showline=True, linewidth=1, linecolor='Lightgray', mirror=True)
fig.update_yaxes(showline=True, linewidth=1, linecolor='Lightgray', mirror=True)
fig.show()
# --- Automated machine learning tools (2019) --------------------------------------------
S = 'Which automated machine learning tools (or partial AutoML tools) do you use on a regular basis? (Select all that apply) - Selected Choice -'
Col = 'Automated Machine Learning Tools'
# Keep the option columns of this question and name them after the option.
Temp = Data19[Search_df(Data19, S)].rename(columns=lambda c: mysplit(c, S))
# Non-null answers per option -> two-column table (option, count).
Temp = Temp.agg(['count']).T.reset_index()
Temp.columns = [Col, 'Count']
# S has no trailing blank, so every label starts with a space: trim it.
Temp[Col] = Temp[Col].map(str.strip)
Temp = Temp.sort_values(by=Col)

fig = px.bar(Temp, x='Count', y=Col, text='Count', orientation='h', barmode='group', height=500)
fig.update_traces(marker_color='royalblue', marker_line_color='navy', marker_line_width=1.5, opacity=1,
                  texttemplate='%{text:.2s}', textposition='outside')
fig.update_layout(title='%s (2019)' % Col, plot_bgcolor='white',
                  uniformtext_minsize=8, uniformtext_mode='hide')
fig.update_xaxes(range=[0, 6e3], showgrid=True, gridwidth=1, gridcolor='Lightgray',
                 showline=True, linewidth=1, linecolor='Lightgray', mirror=True)
fig.update_yaxes(showline=True, linewidth=1, linecolor='Lightgray', mirror=True)
fig.show()
# --- Relational database products (2019) ------------------------------------------------
S = 'Which of the following relational database products do you use on a regular basis? (Select all that apply) - Selected Choice -'
# Fix: the label was a copy-paste of 'Automated Machine Learning Tools', which
# mislabelled both the y axis and the title of this chart.
Col = 'Relational Database Products'
# Keep the option columns of this question and name them after the option.
Temp = Data19[Search_df(Data19, S)].rename(columns=lambda c: mysplit(c, S))
# Non-null answers per option -> two-column table (option, count).
Temp = Temp.agg(['count']).T.reset_index()
Temp.columns = [Col, 'Count']
# S has no trailing blank, so every label starts with a space: trim it.
Temp[Col] = Temp[Col].map(str.strip)
Temp = Temp.sort_values(by=Col)

fig = px.bar(Temp, x='Count', y=Col, text='Count', orientation='h', barmode='group', height=500)
fig.update_traces(marker_color='darkslategray', marker_line_color='black', marker_line_width=1.5, opacity=1,
                  texttemplate='%{text:.2s}', textposition='outside')
fig.update_layout(title='%s (2019)' % Col, plot_bgcolor='white',
                  uniformtext_minsize=8, uniformtext_mode='hide')
fig.update_xaxes(range=[0, 4e3], showgrid=True, gridwidth=1, gridcolor='Lightgray',
                 showline=True, linewidth=1, linecolor='Lightgray', mirror=True)
fig.update_yaxes(showline=True, linewidth=1, linecolor='Lightgray', mirror=True)
fig.show()